Step I. Reading and processing the New York Times (NYT) state-level COVID-19 data
Read in the data
## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data
## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
### FINISH THE CODE HERE ###
# load COVID state-level data from NYT
cv_states <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
### FINISH THE CODE HERE
cv_states <- merge(cv_states, state_pops, by = "state")
Look at the data
- Inspect the dimensions, head, and tail of the data
- Inspect the structure of each variables. Are they in the correct format?
knitr::kable(dim(cv_states))
knitr::kable(head(cv_states))
| Alabama |
2020-06-10 |
1 |
21989 |
744 |
1 |
4887871 |
96.50939 |
AL |
| Alabama |
2021-10-31 |
1 |
832047 |
15573 |
1 |
4887871 |
96.50939 |
AL |
| Alabama |
2021-05-26 |
1 |
542831 |
11138 |
1 |
4887871 |
96.50939 |
AL |
| Alabama |
2020-04-19 |
1 |
4903 |
160 |
1 |
4887871 |
96.50939 |
AL |
| Alabama |
2021-07-07 |
1 |
552911 |
11387 |
1 |
4887871 |
96.50939 |
AL |
| Alabama |
2020-06-21 |
1 |
30021 |
839 |
1 |
4887871 |
96.50939 |
AL |
knitr::kable(tail(cv_states))
| 32245 |
Wyoming |
2020-12-01 |
56 |
33805 |
230 |
56 |
577737 |
5.950611 |
WY |
| 32246 |
Wyoming |
2021-08-15 |
56 |
68272 |
793 |
56 |
577737 |
5.950611 |
WY |
| 32247 |
Wyoming |
2021-03-16 |
56 |
55352 |
693 |
56 |
577737 |
5.950611 |
WY |
| 32248 |
Wyoming |
2021-04-12 |
56 |
56988 |
701 |
56 |
577737 |
5.950611 |
WY |
| 32249 |
Wyoming |
2021-04-01 |
56 |
56389 |
700 |
56 |
577737 |
5.950611 |
WY |
| 32250 |
Wyoming |
2020-11-30 |
56 |
33305 |
215 |
56 |
577737 |
5.950611 |
WY |
knitr::kable(str(cv_states))
## 'data.frame': 32250 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : IDate, format: "2020-06-10" "2021-10-31" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 21989 832047 542831 4903 552911 30021 117242 809485 134417 547135 ...
## $ deaths : int 744 15573 11138 160 11387 839 2037 14869 2285 11252 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
|| || || ||
Add new_cases and new_deaths and correct outliers
- Add variables for new cases, new_cases, and new deaths, new_deaths:
- Hint: You can set new_cases equal to the difference between cases on date i and date i-1, starting on date i=2
- Filter to dates after June 1, 2021
- Use plotly for EDA: See if there are outliers or values that don’t make sense for new_cases and new_deaths. Which states and which dates have strange values?
- Correct outliers: Set negative values for new_cases or new_deaths to 0
- Recalculate cases and deaths as cumulative sum of updated new_cases and new_deaths
- Get the rolling average of new cases and new deaths to smooth over time
- Inspect data again interactively
# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +
geom_line() +
geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
geom_line() +
geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
### FINISH CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
#p2=NULL
Add additional variables
Add population-normalized (by 100,000) variables for each variable type (rounded to 1 decimal place). Make sure the variables you calculate are in the correct format (numeric). You can use the following variable names:
- per100k = cases per 100,000 population
- newper100k= new cases per 100,000
- deathsper100k = deaths per 100,000
- newdeathsper100k = new deaths per 100,000
Add a “naive CFR” variable representing deaths / cases on each date for each state
Create a dataframe representing values on the most recent date, cv_states_today, as done in lecture
### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))
Explore scatterplots using plot_ly()
- Create a scatterplot using plot_ly() representing pop_density vs. various variables (e.g. cases, per100k, deaths, deathsper100k) for each state on most recent date (cv_states_today)
- Color points by state and size points by state population
- Use hover to identify any outliers.
- Remove those outliers and replot.
- Choose one plot. For this plot:
- Add hoverinfo specifying the state name, cases per 100k, and deaths per 100k, similarly to how we did this in the lecture notes
- Add layout information to title the chart and the axes
- Enable hovermode = “compare”
### FINISH CODE HERE
# pop_density vs. cases
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# pop_density vs. deathsper100k
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# Adding hoverinfo
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
hoverinfo = 'text',
text = ~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") ,
paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
hovermode = "compare")
Explore scatterplot trend interactively using ggplotly() and geom_smooth()
- For pop_density vs. newdeathsper100k create a chart with the same variables using gglot_ly()
- Explore the pattern between and using geom_smooth()
- Explain what you see. Do you think pop_density is a correlate of newdeathsper100k?
### FINISH CODE HERE
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) +
geom_point() +
geom_smooth()
ggplotly(p)
Multiple line chart
- Create a line chart of the naive_CFR for all states over time using plot_ly()
- Use the zoom and pan tools to inspect the naive_CFR for the states that had an increase in September. How have they changed over time?
- Create one more line chart, for Florida only, which shows new_cases and new_deaths together in one plot. Hint: use add_layer()
- Use hoverinfo to “eyeball” the approximate peak of deaths and peak of cases. What is the time delay between the peak of cases and the peak of deaths?
### FINISH CODE HERE
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
filter(state=="Florida") %>%
plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>%
add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")
Heatmaps
Create a heatmap to visualize new_cases for each state on each date greater than June 1st, 2021 - Start by mapping selected features in the dataframe into a matrix using the tidyr package function pivot_wider(), naming the rows and columns, as done in the lecture notes - Use plot_ly() to create a heatmap out of this matrix. Which states stand out? - Repeat with newper100k variable. Now which states stand out? - Create a second heatmap in which the pattern of new_cases for each state over time becomes more clear by filtering to only look at dates every two weeks
### FINISH CODE HERE
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% filter(date>as.Date("2020-04-01"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2020-04-01"), as.Date("2020-10-01"), by="2 weeks")
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
Map
- Create a map to visualize the naive_CFR by state on October 15, 2021
- Compare with a map visualizing the naive_CFR by state on most recent date
- Plot the two maps together using subplot(). Make sure the shading is for the same range of values (google is your friend for this)
- Describe the difference in the pattern of the CFR.
### For specified date
pick.date = "2021-10-15"
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% filter(date==pick.date) %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", pick.date), limits = c(0,shadeLimit))
fig <- fig %>% layout(
title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
geo = set_map_details
)
fig_pick.date <- fig
#############
### Map for today's date
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0,shadeLimit))
fig <- fig %>% layout(
title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
geo = set_map_details
)
fig_Today <- fig
### Plot together
subplot(fig_pick.date, fig_Today, nrows = 2, margin = .05)